# ------------------------------------------------------------------------------------------------
### Function to clean parliamentary election data
# ------------------------------------------------------------------------------------------------
clean_election_data <- function(file_path, file_type = "excel", sheet = 1, 
                               col_select, col_mapping, election_date, 
                               has_konf = TRUE, has_extra_cols = FALSE) {
  
  # Read data based on file type - read as text first to avoid type detection issues
  if (file_type == "excel") {
    data <- read_excel(file_path, sheet = sheet, col_types = "text") %>% 
      select(all_of(col_select))
  } else if (file_type == "csv") {
    data <- read_delim(file_path, delim = ";", 
                      col_select = all_of(col_select),
                      col_types = cols(.default = col_character()))
  }
  
  # Apply column mapping and transformations
  data <- data %>%
    select(!!!col_mapping) %>%
    # Convert to numeric, handling any non-numeric values
    mutate(across(c(n_elig_voters, nturnout, nballotscast, nvalid, npis, npo, npsl, nsld), 
                  ~ as.numeric(ifelse(. == "" | is.na(.), "0", .))),
           nkonf = if(has_konf) as.numeric(ifelse(nkonf == "" | is.na(nkonf), "0", nkonf)) else NA,
           edate = as.Date(election_date)) %>%
    # Remove overseas territories and ships
    filter(!grepl("statki|zagranica|zagraniczne", tolower(municipality)))
  
  return(data)
}

# ------------------------------------------------------------------------------------------------
### Define election configurations:
# ------------------------------------------------------------------------------------------------
# Configuration parameters for each election year's data cleaning
election_configs <- list(
  list(year = "2001", file = "../data/raw/2001_gmina_sejm.xls", 
       col_select = c(2:9, 13:15),
       col_mapping = list(id = 1, municipality = 2, n_elig_voters = 4, nturnout = 5, 
                         nballotscast = 6, nvalid = 7, npis = 9, npo = 11, 
                         npsl = 10, nsld = 8),
       election_date = "2001-09-23", has_konf = FALSE),
  
  list(year = "2005", file = "../data/raw/2005_gmina_sejm.xls",
       col_select = c(2:7, 9, 15:17, 19, 21),
       col_mapping = list(id = 1, municipality = 2, n_elig_voters = 4, nturnout = 5,
                         nballotscast = 6, nvalid = 7, npis = 8, npo = 10,
                         npsl = 11, nsld = 9, nkonf = 12),
       election_date = "2005-09-25", has_konf = TRUE),
  
  list(year = "2007", file = "../data/raw/2007_gmina_sejm.xls",
       col_select = c(2:3, 5:9, 12:14, 17),
       col_mapping = list(id = 1, municipality = 2, n_elig_voters = 4, nturnout = 5,
                         nballotscast = 6, nvalid = 7, npis = 8, npo = 9,
                         npsl = 10, nsld = 11),
       election_date = "2007-10-21", has_konf = FALSE),
  
  list(year = "2011", file = "../data/raw/2011_gmina_sejm.xls",
       col_select = c(2:3, 6, 8, 13, 15, 23, 25, 27, 29, 31),
       col_mapping = list(id = 1, municipality = 2, n_elig_voters = 3, nturnout = 4,
                         nballotscast = 5, nvalid = 6, npis = 7, npo = 10,
                         npsl = 9, nsld = 8, nkonf = 11),
       election_date = "2011-10-09", has_konf = TRUE),
  
  list(year = "2015", file = "../data/raw/2015_gmina_sejm.xls",
       col_select = c(2:3, 5, 8, 21, 26:28, 30:32),
       col_mapping = list(id = 1, municipality = 2, n_elig_voters = 3, nturnout = 4,
                         nballotscast = 5, nvalid = 6, npis = 7, npo = 8,
                         npsl = 10, nsld = 11, nkonf = 9),
       election_date = "2015-10-25", has_konf = TRUE),
  
  list(year = "2019", file = "../data/raw/2019_gmina_sejm.csv", file_type = "csv",
       col_select = c(1:4, 6, 8, 21, 26:27, 29:30, 32, 34),
       col_mapping = list(id = 1, municipality = 2, county = 3, voivodship = 4,
                         n_elig_voters = 5, nturnout = 6, nballotscast = 7, nvalid = 8,
                         npis = 12, npo = 9, npsl = 11, nsld = 13, nkonf = 10),
       election_date = "2019-10-13", has_konf = TRUE, has_extra_cols = TRUE)
)

# ------------------------------------------------------------------------------------------------
### Clean all parliamentary elections:
# ------------------------------------------------------------------------------------------------
# Apply cleaning function to all election years
sejm_data <- map(election_configs, function(config) {
  clean_election_data(
    file_path = config$file,
    file_type = if(is.null(config$file_type)) "excel" else config$file_type,
    col_select = config$col_select,
    col_mapping = config$col_mapping,
    election_date = config$election_date,
    has_konf = config$has_konf,
    has_extra_cols = if(is.null(config$has_extra_cols)) FALSE else config$has_extra_cols
  )
})

# Assign to individual variables for easier access
sejm_2001 <- sejm_data[[1]]
sejm_2005 <- sejm_data[[2]]
sejm_2007 <- sejm_data[[3]]
sejm_2011 <- sejm_data[[4]]
sejm_2015 <- sejm_data[[5]]
sejm_2019 <- sejm_data[[6]]

# ------------------------------------------------------------------------------------------------
### Update outdated municipality IDs:
# ------------------------------------------------------------------------------------------------ 

# Update the 2001 ids of other municipalities that changed codes in 2002-2003
sejm_2001 <- sejm_2001 %>%
  mutate(id = case_when(
    # 2002 changes
    id == "180101" ~ "182101", # Baligród
    id == "280602" ~ "281801", # Banie Mazurskie 
    id == "100601" ~ "102101", # m. Brzeziny
    id == "100604" ~ "102102", # gm. Brzeziny
    id == "280603" ~ "281901", # Budry
    id == "180102" ~ "182102", # Cisna
    id == "100605" ~ "102103", # Dmosin
    id == "320401" ~ "321801", # Dobra   
    id == "281301" ~ "281802", # Dubeninki
    id == "220902" ~ "221601", # Dzierzgoń    
    id == "281302" ~ "281803", # Gołdap
    id == "100606" ~ "102104", # Jeżów
    id == "180104" ~ "182103", # Lesko    
    id == "321407" ~ "321802", # Łobez
    id == "220905" ~ "221602", # Mikołajki
    id == "180106" ~ "182104", # Olszanica
    id == "280607" ~ "281902", # Pozezdrze
    id == "320505" ~ "321803", # Radowo Małe
    id == "320506" ~ "321804", # Resko 
    id == "100609" ~ "102105", # Rogów
    id == "080409" ~ "081201", # Sława
    id == "121202" ~ "240108", # Sławków
    id == "180107" ~ "182105", # Solina    
    id == "220909" ~ "221603", # Stary Dzierzgoń
    id == "220910" ~ "221604", # Stary Targ
    id == "141202" ~ "141215", # Sulejówek
    id == "080410" ~ "081202", # Szlichtyngowa  
    id == "220911" ~ "221605", # Sztum
    id == "280609" ~ "281903", # Węgorzewo
    id == "321412" ~ "321805", # Węgorzyno
    id == "080411" ~ "081203", # Wschowa
    # 2003 changes
    id == "180510" ~ "121616", # Szerzyny
    id == "140610" ~ "141806", # Tarczyn  
    TRUE ~ id
  ))

# Assign the new id of Rejowiec 060315 instead of 060608 before 2006
sejm_2001$id <- ifelse(sejm_2001$id == "060608", "060315", sejm_2001$id)
sejm_2005$id <- ifelse(sejm_2005$id == "060608", "060315", sejm_2005$id)

# Assign the new id of Wałbrzych 026501 instead of 022109 (2013) and 026301 (2003)
sejm_2001$id <- ifelse(sejm_2001$id == "026301", "026501", sejm_2001$id)
sejm_2005$id <- ifelse(sejm_2005$id == "022109", "026501", sejm_2005$id)
sejm_2007$id <- ifelse(sejm_2007$id == "022109", "026501", sejm_2007$id)
sejm_2011$id <- ifelse(sejm_2011$id == "022109", "026501", sejm_2011$id)

# ------------------------------------------------------------------------------------------------
### Harmonize municipal boundaries:
# ------------------------------------------------------------------------------------------------
# Function to aggregate Warsaw districts
aggregate_warsaw <- function(data, election_date, id_pattern = "^1465", extra_ids = NULL) {
  # Create filter condition for Warsaw districts
  if(!is.null(extra_ids)) {
    filter_condition <- paste0("grepl('", id_pattern, "', id) | id %in% c('", paste(extra_ids, collapse = "', '"), "')")
  } else {
    filter_condition <- paste0("grepl('", id_pattern, "', id)")
  }
  
  # Create aggregated Warsaw row
  warsaw_agg <- data %>%
    filter(!!rlang::parse_expr(filter_condition)) %>%
    summarize(
      edate = as.Date(election_date),
      id = "146501",
      municipality = "m. st. Warszawa",
      across(c(n_elig_voters, nturnout, nballotscast, nvalid, npis, npo, npsl, nsld, nkonf), \(x) sum(x, na.rm = TRUE))
    )
  
  # Return data with Warsaw districts removed and aggregated row added
  if(!is.null(extra_ids)) {
    data %>%
      filter(!grepl(id_pattern, id) & !id %in% extra_ids) %>%
      bind_rows(warsaw_agg)
  } else {
    data %>%
      filter(!grepl(id_pattern, id)) %>%
      bind_rows(warsaw_agg)
  }
}

# Apply Warsaw aggregation to all datasets
sejm_2001 <- aggregate_warsaw(sejm_2001, "2001-09-23", "^1431", "141203")
sejm_2007 <- aggregate_warsaw(sejm_2007, "2007-10-21", "^1465")
sejm_2011 <- aggregate_warsaw(sejm_2011, "2011-10-09", "^1465")
sejm_2015 <- aggregate_warsaw(sejm_2015, "2015-10-25", "^1465")
sejm_2019 <- aggregate_warsaw(sejm_2019, "2019-10-13", "^1465")

# ------------------------------------------------------------------------------------------------
### Handle municipal mergers and transfers:
# ------------------------------------------------------------------------------------------------
# Transform municipalities that merged, split or exchanged significant areas
# Logic: create comparables retrospectively for units that exist in 2019

# Columns with vote numbers to update
votes <- c("n_elig_voters", "nturnout", "nballotscast", "nvalid", "npis", "npo", "npsl", "nsld", "nkonf")

# Function: transfer a given share of votes from one municipality to another
transfer_votes <- function(data, donor_id, recipient_id, transfer_rate) {
  donor_values <- data %>% 
    filter(id == donor_id) %>% 
    select(n_elig_voters:nkonf) 
  
  transfer_amounts <- donor_values * transfer_rate
  
  data[data$id == donor_id, votes] <- 
    data[data$id == donor_id, votes] - transfer_amounts
  
  data[data$id == recipient_id, votes] <- 
    data[data$id == recipient_id, votes] + transfer_amounts
  
  return(data)
}

# Function: create new municipality from existing one
create_municipality <- function(data, source_id, new_id, new_name, transfer_rate = 0) {
  new_municipality <- data %>%
    filter(id == source_id) %>%
    mutate(
      id = new_id,
      municipality = new_name,
      across(n_elig_voters:nkonf, ~ 0)
    )
  
  # Add new municipality and transfer votes if needed
  data <- rbind(data, new_municipality)
  if(transfer_rate > 0) {
    data <- transfer_votes(data, source_id, new_id, transfer_rate)
  }
  return(data)
}

# Function: complete merger (100% transfer and remove donor)
complete_merger <- function(data, donor_id, recipient_id) {
  data <- transfer_votes(data, donor_id, recipient_id, 1) %>%
    filter(id != donor_id)
  return(data)
}

# Function: split municipality between two recipients
split_municipality <- function(data, source_id, recipient1_id, recipient2_id, split_rate) {
  source_values <- data[data$id == source_id, 3:10]
  split1 <- source_values * split_rate
  split2 <- source_values * (1 - split_rate)
  
  data[data$id == recipient1_id, 3:10] <- data[data$id == recipient1_id, 3:10] + split1
  data[data$id == recipient2_id, 3:10] <- data[data$id == recipient2_id, 3:10] + split2
  data <- data[data$id != source_id, ]
  
  return(data)
}

# Define municipal changes by time period
municipal_changes <- list(
  "pre_2005" = list(
    # Olszanica -> Ustrzyki Dolne: 17,7% of votes from Olszanica proportionally transferred to Ustrzyki Dolne for 2001
    list(type = "transfer", years = c("2001"), donor = "182104", recipient = "180108", rate = 0.177),
    # Solec nad Wisłą -> Łaziska: 5,9% of votes from Solec nad Wisłą proportionally transferred to Łaziska for 2001
    list(type = "transfer", years = c("2001"), donor = "140906", recipient = "061204", rate = 0.059)
  ),
  "pre_2007" = list(
    # Zabłudów -> Białystok: 24,8% of votes from Zabłudów proportionally transferred to Białystok for 2001-2005
    list(type = "transfer", years = c("2001", "2005"), donor = "200214", recipient = "206101", rate = 0.248),
    # Świlcza -> Rzeszów: 8,5% of votes from Świlcza proportionally transferred to Rzeszów for 2001-2005
    list(type = "transfer", years = c("2001", "2005"), donor = "181612", recipient = "186301", rate = 0.085)
  ),
  "pre_2011" = list(
    # Wiżajny -> Rutka-Tartak: 8,7% of votes from Wiżajny proportionally transferred to Rutka-Tartak for 2001-2007
    list(type = "transfer", years = c("2001", "2005", "2007"), donor = "201209", recipient = "201206", rate = 0.087),
    # Dukla -> Jaśliska: Jaśliska (~12.2% elig. voters) seceded from Dukla in 2010. Do same for before
    list(type = "create", years = c("2001", "2005", "2007"), source = "180702", new_id = "180710", new_name = "gm. Jaśliska", rate = 0.122)
  ),
  "pre_2015" = list(
    # gm. -> m. Zielona Góra: complete merger of the village into the namesake city
    list(type = "complete_merger", years = c("2001", "2005", "2007", "2011"), donor = "080910", recipient = "086201")
  ),
  "pre_2019" = list(
    # Ostrowice -> Drawsko & Złocieniec: Ostrowice split in 2019 btw. Drawsko (45.6%) and Złocieniec. Split prior years
    list(type = "split", years = c("2001", "2005", "2007", "2011", "2015"), source = "320304", recipient1 = "320302", recipient2 = "320306", rate = 0.456),
    # Dobrzeń Wielki -> Opole: 36,4% of votes from Dobrzeń Wielki proportionally transferred to Opole for 2001-2015
    list(type = "transfer", years = c("2001", "2005", "2007", "2011", "2015"), donor = "160903", recipient = "166101", rate = 0.364),
    # Komańcza -> Jaśliska: 5,2% of votes from Komańcza proportionally transferred to Jaśliska for 2001-2015
    list(type = "transfer", years = c("2001", "2005", "2007", "2011", "2015"), donor = "181704", recipient = "180710", rate = 0.052),
    # Dąbrowa -> Opole: 15,7% of votes from Dąbrowa proportionally transferred to Opole for 2001-2015
    list(type = "transfer", years = c("2001", "2005", "2007", "2011", "2015"), donor = "160902", recipient = "166101", rate = 0.157)
  )
)

# Apply all municipal changes
for(period in names(municipal_changes)) {
  for(change in municipal_changes[[period]]) {
    for(year in change$years) {
      df_name <- paste0("sejm_", year)
      df <- get(df_name)
      
      if(change$type == "transfer") {
        df <- transfer_votes(df, change$donor, change$recipient, change$rate)
      } else if(change$type == "create") {
        df <- create_municipality(df, change$source, change$new_id, change$new_name, change$rate)
      } else if(change$type == "complete_merger") {
        df <- complete_merger(df, change$donor, change$recipient)
      } else if(change$type == "split") {
        df <- split_municipality(df, change$source, change$recipient1, change$recipient2, change$rate)
      }
      
      assign(df_name, df)
    }
  }
}

# Result: 2477 gminas for each year

# Remove unnecessary objects
rm(votes, transfer_votes, create_municipality, complete_merger, split_municipality, municipal_changes, period, change, year, df_name, df)

# ------------------------------------------------------------------------------------------------
### Merge and save all election data:
# ------------------------------------------------------------------------------------------------
# Function to add administrative units from 2019 data
add_admin_units <- function(data) {
  data %>%
    merge(sejm_2019[,c("id", "county", "voivodship")], by = "id", all.x = TRUE)
}

# Harmonize the number of columns and the names of admin. units in all years
# Take the county and voivodship names from 2019
sejm_datasets <- list(sejm_2001, sejm_2005, sejm_2007, sejm_2011, sejm_2015, sejm_2019)
sejm_datasets <- map(sejm_datasets, add_admin_units)

# Combine all years and calculate percentage variables
parl_elections <- sejm_datasets %>%
  bind_rows() %>%
  # Ensure all numeric columns are actually numeric
  mutate(across(c(n_elig_voters, nturnout, nballotscast, nvalid, npis, npo, npsl, nsld, nkonf), as.numeric)) %>%
  # Calculate outcome variables
  mutate(
    turnout = 100*nturnout / n_elig_voters,
    valid = 100*nvalid / nballotscast,
    pis = 100*npis / nvalid,
    po = 100*npo / nvalid,
    psl = 100*npsl / nvalid,
    sld = 100*nsld / nvalid,
    konf = 100*nkonf / nvalid,
    non_pis = 100 - pis,
    pis_support = 100*npis / n_elig_voters,
    po_support = 100*npo / n_elig_voters,
    psl_support = 100*npsl / n_elig_voters,
    sld_support = 100*nsld / n_elig_voters,
    konf_support = 100*nkonf / n_elig_voters,
    non_pis_support = 100*(nvalid-npis) / n_elig_voters,
    # Generate county and voivodship ids from municipal ids
    id_county = substr(id, 1, 4),
    id_voivodship = substr(id, 1, 2)
  ) %>%
  # Drop absolute counts and reorder columns
  select(edate, id, municipality, id_county, county, id_voivodship, voivodship,
         turnout, pis, po, psl, sld, konf, non_pis,
         pis_support, po_support, psl_support, sld_support, konf_support, non_pis_support)

# Clean up
rm(sejm_2001, sejm_2005, sejm_2007, sejm_2011, sejm_2015, sejm_2019, sejm_datasets, add_admin_units)

# Save cleaned election data
save(parl_elections, file="../data/raw/parl_elections_2001-2019.Rda")

